# PLEASE MAKE SURE TO INSTALL ALL THE PACKAGES COMMENTED OUT BELOW

# Install the necessary packages (uncomment any that are not already installed)
#install.packages("corrplot")
#install.packages("skimr")
#install.packages("tidyverse")
#install.packages("Hmisc")
#install.packages("readr")
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("tidyr")
#install.packages("scales")



library(corrplot)
## corrplot 0.92 loaded
library(skimr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(Hmisc)
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(readr)
library(dplyr)
library(ggplot2)
library(tidyr)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Location of the raw cereal CSV on GitHub
url <- "https://raw.githubusercontent.com/kwartler/Hult_Intro2R/main/A1_CerealEDA/cereals.csv"

# Download the file and parse it into `data`
data <- read_csv(file = url)
## Rows: 185 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): cerealName, parsedName, brand, dietLabels, healthLabels, rawGPTRan...
## dbl (59): calories, Energy_kcal, Total.lipid..fat._g, Fatty.acids..total.sat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the first occurrence of each fully duplicated row (if there are
# any) and, in the same step, keep every column except the unnecessary
# 'parsedName'.
data <- data[!duplicated(data), setdiff(colnames(data), "parsedName")]


# Show the cleaned dataframe
print(data)
## # A tibble: 185 × 65
##    cerealName              brand    dietLabels healthLabels calories Energy_kcal
##    <chr>                   <chr>    <chr>      <chr>           <dbl>       <dbl>
##  1 100% Bran Cereal        nabisco  HIGH_FIBE… LOW_FAT_ABS…      161       161. 
##  2 All Bran Bran Buds      kellog   HIGH_FIBE… LOW_FAT_ABS…      231       232. 
##  3 Almond Oatmeal Crisp    general… LOW_FAT, … FAT_FREE, L…      201       201. 
##  4 Apple Cinnamon Cheerios general… LOW_SODIUM LOW_FAT_ABS…      105       105. 
##  5 Banana Nut Crunch       post     LOW_CARB,… SUGAR_CONSC…      813       814. 
##  6 Barley, Wheat           quaker   HIGH_FIBE… LOW_FAT_ABS…      556       556. 
##  7 Blueberry Morning, Post post     LOW_FAT, … FAT_FREE, L…       84        84.4
##  8 Bran Cereal             nabisco  HIGH_FIBE… LOW_FAT_ABS…      161       161. 
##  9 Bran Flakes             kellog   HIGH_FIBE… LOW_FAT_ABS…      161       161. 
## 10 Bran, Raisin            quaker   LOW_FAT, … FAT_FREE, L…      433       434. 
## # ℹ 175 more rows
## # ℹ 59 more variables: Total.lipid..fat._g <dbl>,
## #   Fatty.acids..total.saturated_g <dbl>, Fatty.acids..total.trans_g <dbl>,
## #   Fatty.acids..total.monounsaturated_g <dbl>,
## #   Fatty.acids..total.polyunsaturated_g <dbl>,
## #   Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## #   Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>, …
                                      ########################################
                                      #                                      #
                                      #           BASIC EDA                  #
                                      #                                      #
                                      ########################################
########################################
#                                      #
#       HISTOGRAM ANALYSIS             #
#                                      #
########################################


# Draw one histogram per numeric column of `data`.
for (col in names(data)) {
  # Non-numeric columns (names, brands, label strings) cannot be binned
  if (!is.numeric(data[[col]])) {
    next
  }

  # `.data[[col]]` looks the column up by its name held in `col`.
  # `bins = 30` lets the bin width follow each column's own range; a single
  # fixed bin width cannot suit columns whose ranges span a few units
  # (e.g. Thiamin_mg) as well as columns spanning thousands (e.g. calories).
  # `after_stat(count)` replaces the deprecated `..count..` notation.
  hist_plot <- ggplot(data, aes(x = .data[[col]])) +
    geom_histogram(aes(fill = after_stat(count)), bins = 30, color = "black") +
    labs(x = col, y = "Count", title = paste("Histogram of", col)) +
    scale_fill_gradient(low = "blue", high = "red") +  # Colour bars by count
    theme_minimal()  # Simple white background

  # Plots built inside a loop must be printed explicitly to be rendered
  print(hist_plot)
}
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 64 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 5 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 5 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 3 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 17 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 26 rows containing non-finite values (`stat_bin()`).

########################################
#                                      #
#       GROUPBY ANALYSIS               #
#                                      #
########################################


# Per-brand averages of calories and energy; missing values are ignored
group_by_brand <- summarise(
  group_by(data, brand),
  mean_calories = mean(calories, na.rm = TRUE),
  mean_energy = mean(Energy_kcal, na.rm = TRUE)
)

print(group_by_brand)
## # A tibble: 5 × 3
##   brand        mean_calories mean_energy
##   <chr>                <dbl>       <dbl>
## 1 generalMills          300.        300.
## 2 kellog                246.        247.
## 3 nabisco               251.        251.
## 4 post                  426.        426.
## 5 quaker                478.        479.
# One bar per brand, bar height = average calories; the legend is hidden
# because the fill colour only repeats the x-axis
ggplot(group_by_brand, aes(x = brand, y = mean_calories, fill = brand)) +
  geom_col() +
  labs(
    title = "Average Calories per Brand",
    x = "Brand",
    y = "Average Calories"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

# Average total fat and cholesterol for every brand / dietLabels combination.
# Missing values are ignored. `.groups = "drop"` returns an ungrouped tibble,
# so later operations on the result are not silently performed per brand, and
# the "summarise() has grouped output" message is no longer emitted.
group_by_brand_diet <- data %>%
  group_by(brand, dietLabels) %>%
  summarise(
    mean_fat = mean(Total.lipid..fat._g, na.rm = TRUE),
    mean_cholesterol = mean(Cholesterol_mg, na.rm = TRUE),
    .groups = "drop"
  )

# Side-by-side bars: one bar per diet label combination within each brand
ggplot(group_by_brand_diet, aes(x = brand, y = mean_fat, fill = dietLabels)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Average Fat per Brand by Diet Labels",
    x = "Brand",
    y = "Average Fat"
  )

# Count how many cereals carry each distinct 'dietLabels' string
# (rows with a missing label form their own NA group)
dietLabels_counts <- data %>%
  dplyr::count(dietLabels, name = "n")

# Distinct 'dietLabels' values, including NA when some rows have no label
unique_labels <- unique(dietLabels_counts$dietLabels)

# Custom fill palette
my_colors <- c("#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#FF00FF", "#00FFFF", "#FFA500", "#008000", "#800080")

# Only real (non-NA) labels take a colour from the palette. Counting the NA
# group as well would index past the end of `my_colors` and append an NA
# colour. Fail early if the palette is too short for the labels present.
stopifnot(sum(!is.na(unique_labels)) <= length(my_colors))

# Bar chart of label frequencies, bars ordered by increasing count
ggplot2::ggplot(
  dietLabels_counts,
  ggplot2::aes(x = reorder(dietLabels, n), y = n, fill = dietLabels)
) +
  # Bar heights are the pre-computed counts
  ggplot2::geom_col() +
  ggplot2::labs(x = "Diet Labels", y = "Frequency", title = "Frequency Count of Diet Labels") +
  # Tilt the long x-axis labels so they do not overlap
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1)) +
  # One palette colour per non-NA label
  ggplot2::scale_fill_manual(
    values = my_colors[seq_len(sum(!is.na(unique_labels)))]
  )

########################################
#                                      #
#       HEALTH ANALYSIS                #
#                                      #
########################################

# Split each cereal's healthLabels string on ", " into its individual labels
# (one character vector per row). `fixed = TRUE` treats the separator as
# literal text rather than a regular expression.
health_label_sets <- strsplit(data$healthLabels, ", ", fixed = TRUE)

# Every label occurrence across all cereals, flattened into one vector
all_labels <- unlist(health_label_sets)

# Distinct labels. A row with a missing healthLabels value would contribute
# NA here, which is not a label, so it is removed.
unique_labels <- unique(all_labels)
unique_labels <- unique_labels[!is.na(unique_labels)]
# To inspect the labels: print(unique_labels)

# Build one logical indicator per label: TRUE when the cereal carries exactly
# that label. Whole labels are compared instead of grepl() substring matching,
# so that e.g. "FISH_FREE" is not reported as present merely because the row
# contains "SHELLFISH_FREE", and label text is never interpreted as a regex.
encoded_data <- lapply(unique_labels, function(label) {
  vapply(health_label_sets, function(labels) label %in% labels, logical(1))
})
names(encoded_data) <- unique_labels

# Convert the named list of indicators to a data frame;
# optional = TRUE keeps the label names unchanged as column names
encoded_data <- as.data.frame(encoded_data, optional = TRUE)

# Append the indicator columns to the original columns of `data`
data <- cbind(data, encoded_data)


# Health-label indicator columns (added to `data` by the encoding step) to
# summarise
column_names <- c(
  "LOW_FAT_ABS", "VEGAN", "VEGETARIAN", "PESCATARIAN", "MEDITERRANEAN",
  "DAIRY_FREE", "EGG_FREE", "MILK_FREE", "PEANUT_FREE", "TREE_NUT_FREE",
  "SOY_FREE", "FISH_FREE", "SHELLFISH_FREE", "PORK_FREE", "RED_MEAT_FREE",
  "CRUSTACEAN_FREE", "CELERY_FREE", "MUSTARD_FREE", "SESAME_FREE",
  "LUPINE_FREE", "MOLLUSK_FREE", "ALCOHOL_FREE", "NO_OIL_ADDED",
  "SULPHITE_FREE", "KOSHER", "FAT_FREE", "SUGAR_CONSCIOUS", "LOW_POTASSIUM",
  "KIDNEY_FRIENDLY", "WHEAT_FREE", "LOW_SUGAR", "KETO_FRIENDLY", "DASH",
  "GLUTEN_FREE", "NO_SUGAR_ADDED", "FODMAP_FREE", "PALEO", "SPECIFIC_CARBS"
)


# The mean of a TRUE/FALSE column is the share of TRUE values, so multiplying
# by 100 gives the percentage of cereals carrying each label
percentages <- 100 * colMeans(data[, column_names])


# Show the percentage for each health label
print(percentages)
##     LOW_FAT_ABS           VEGAN      VEGETARIAN     PESCATARIAN   MEDITERRANEAN 
##       76.216216       89.189189      100.000000      100.000000       57.297297 
##      DAIRY_FREE        EGG_FREE       MILK_FREE     PEANUT_FREE   TREE_NUT_FREE 
##       97.297297       98.918919       98.378378       92.972973       82.702703 
##        SOY_FREE       FISH_FREE  SHELLFISH_FREE       PORK_FREE   RED_MEAT_FREE 
##       97.837838      100.000000      100.000000      100.000000      100.000000 
## CRUSTACEAN_FREE     CELERY_FREE    MUSTARD_FREE     SESAME_FREE     LUPINE_FREE 
##      100.000000      100.000000      100.000000      100.000000      100.000000 
##    MOLLUSK_FREE    ALCOHOL_FREE    NO_OIL_ADDED   SULPHITE_FREE          KOSHER 
##      100.000000      100.000000       76.216216       92.432432      100.000000 
##        FAT_FREE SUGAR_CONSCIOUS   LOW_POTASSIUM KIDNEY_FRIENDLY      WHEAT_FREE 
##       27.567568       41.621622       58.378378       81.621622       43.243243 
##       LOW_SUGAR   KETO_FRIENDLY            DASH     GLUTEN_FREE  NO_SUGAR_ADDED 
##        9.189189       11.351351       32.972973       40.540541       25.945946 
##     FODMAP_FREE           PALEO  SPECIFIC_CARBS 
##       18.378378       13.513514       12.972973
# Health-label indicator columns to plot, one chart per label
logical_columns <- c("LOW_FAT_ABS", "VEGAN", "VEGETARIAN", "PESCATARIAN", "MEDITERRANEAN", "DAIRY_FREE",
                     "EGG_FREE", "MILK_FREE", "PEANUT_FREE", "TREE_NUT_FREE", "SOY_FREE", "FISH_FREE",
                     "SHELLFISH_FREE", "PORK_FREE", "RED_MEAT_FREE", "CRUSTACEAN_FREE", "CELERY_FREE",
                     "MUSTARD_FREE", "SESAME_FREE", "LUPINE_FREE", "MOLLUSK_FREE", "ALCOHOL_FREE",
                     "NO_OIL_ADDED", "SULPHITE_FREE", "KOSHER", "FAT_FREE", "SUGAR_CONSCIOUS", "LOW_POTASSIUM",
                     "KIDNEY_FRIENDLY", "WHEAT_FREE", "LOW_SUGAR", "KETO_FRIENDLY", "DASH", "GLUTEN_FREE",
                     "NO_SUGAR_ADDED", "FODMAP_FREE", "PALEO", "SPECIFIC_CARBS")


# For each label, plot the share of TRUE vs FALSE cereals within every brand
for (col in logical_columns) {
  # Keep only the brand and the current indicator column
  data_logical <- data[, c("brand", col)]

  # Reshape to long format: `column` holds the label name and `logical_value`
  # the TRUE/FALSE flag. pivot_longer() replaces the superseded gather().
  data_logical_long <- data_logical %>%
    tidyr::pivot_longer(
      cols = -brand,
      names_to = "column",
      values_to = "logical_value"
    )

  # position = "fill" scales every bar to 1 so the bars show proportions
  plot_logical <- ggplot(data_logical_long, aes(x = brand, fill = logical_value)) +
    geom_bar(position = "fill") +
    labs(x = "Brand", y = "Proportion", title = paste("Proportion of", col, "by Brand")) +
    facet_wrap(~column, scales = "free_x", nrow = 1) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))

  # Plots built inside a loop must be printed explicitly to be rendered
  print(plot_logical)
}

########################################
#                                      #
#          DIET ANALYSIS               #
#                                      #
########################################

# Split each cereal's dietLabels string on ", " into its individual labels
# (one character vector per row). `fixed = TRUE` treats the separator as
# literal text rather than a regular expression.
diet_label_sets <- strsplit(data$dietLabels, ", ", fixed = TRUE)

# Every diet label occurrence across all cereals, flattened into one vector
all_dietLabels <- unlist(diet_label_sets)

# Distinct diet labels. Rows with a missing dietLabels value contribute NA
# here; NA is not a label and would otherwise get its own indicator column,
# so it is removed.
unique_dietLabels <- unique(all_dietLabels)
unique_dietLabels <- unique_dietLabels[!is.na(unique_dietLabels)]
# To inspect the labels: print(unique_dietLabels)

# Build one logical indicator per diet label: TRUE when the cereal carries
# exactly that label. Whole labels are compared instead of grepl() substring
# matching, so one label can never match as part of a longer one and label
# text is never interpreted as a regex. Rows with a missing dietLabels value
# are FALSE for every label.
encoded_data <- lapply(unique_dietLabels, function(label) {
  vapply(diet_label_sets, function(labels) label %in% labels, logical(1))
})
names(encoded_data) <- unique_dietLabels

# Convert the named list of indicators to a data frame;
# optional = TRUE keeps the label names unchanged as column names
encoded_data <- as.data.frame(encoded_data, optional = TRUE)

# Append the indicator columns to the existing columns of `data`
data <- cbind(data, encoded_data)

# Diet-label indicator columns (added to `data` by the encoding step) to
# summarise
column_names <- c("HIGH_FIBER", "LOW_SODIUM", "LOW_FAT", "LOW_CARB", "BALANCED")

# The mean of a TRUE/FALSE column is the share of TRUE values, so multiplying
# by 100 gives the percentage of cereals carrying each diet label
percentages <- 100 * colMeans(data[, column_names])


# Show the percentage for each diet label
print(percentages)
## HIGH_FIBER LOW_SODIUM    LOW_FAT   LOW_CARB   BALANCED 
##  17.837838  92.432432  63.243243  15.135135   3.243243
# Keep the brand plus the five diet-label indicator columns
columns <- c("brand", "HIGH_FIBER", "LOW_SODIUM", "LOW_FAT", "LOW_CARB", "BALANCED")
data_logical <- data[, columns]

# Reshape to long format: `column` holds the diet label name and
# `logical_value` the TRUE/FALSE flag. pivot_longer() replaces the superseded
# gather().
data_logical_long <- data_logical %>%
  tidyr::pivot_longer(
    cols = -brand,
    names_to = "column",
    values_to = "logical_value"
  )

# One facet per diet label; position = "fill" scales every bar to 1 so the
# bars show the proportion of TRUE vs FALSE cereals within each brand
plot_logical <- ggplot(data_logical_long, aes(x = brand, fill = logical_value)) +
  geom_bar(position = "fill") +
  labs(x = "Brand", y = "Proportion", title = "Proportion of Logical Values by Brand") +
  facet_wrap(~column, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Render the plot
print(plot_logical)

########################################
#                                      #
#          NUTRIENT ANALYSIS           #
#                                      #
########################################

# Location of the raw cereal CSV on GitHub
url <- "https://raw.githubusercontent.com/kwartler/Hult_Intro2R/main/A1_CerealEDA/cereals.csv"

# Read the CSV again from the source. This overwrites `data`, so the
# nutrient analysis starts from the file as published, without the row and
# column changes made to `data` earlier in the script.
data <- read_csv(file = url)
## Rows: 185 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): cerealName, parsedName, brand, dietLabels, healthLabels, rawGPTRan...
## dbl (59): calories, Energy_kcal, Total.lipid..fat._g, Fatty.acids..total.sat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build a per-column summary of `data` with skim() (skimr is attached at the
# top of the script)
summary_table <- skim(data)

# Print the summary: character columns and numeric columns are reported in
# separate sections, each with missing-value counts per column
print(summary_table)
## ── Data Summary ────────────────────────
##                            Values
## Name                       data  
## Number of rows             185   
## Number of columns          66    
## _______________________          
## Column type frequency:           
##   character                7     
##   numeric                  59    
## ________________________         
## Group variables            None  
## 
## ── Variable type: character ────────────────────────────────────────────────────
##   skim_variable  n_missing complete_rate min max empty n_unique whitespace
## 1 cerealName             0         1       3  45     0      185          0
## 2 parsedName             0         1       8   8     0        1          0
## 3 brand                  0         1       4  12     0        5          0
## 4 dietLabels             2         0.989   7  32     0        9          0
## 5 healthLabels           0         1     294 447     0       60          0
## 6 rawGPTRank             0         1       1 264     0       39          0
## 7 gptDescription         0         1     105 312     0      185          0
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable                        n_missing complete_rate     mean      sd
##  1 calories                                     0         1     340.     306.   
##  2 Energy_kcal                                  0         1     340.     306.   
##  3 Total.lipid..fat._g                          0         1      13.3     28.9  
##  4 Fatty.acids..total.saturated_g               0         1       2.51     9.36 
##  5 Fatty.acids..total.trans_g                  64         0.654   0.0790   0.678
##  6 Fatty.acids..total.monounsaturated_g         0         1       6.27    14.8  
##  7 Fatty.acids..total.polyunsaturated_g         0         1       3.69     7.66 
##  8 Carbohydrate..by.difference_g                0         1      54.3     45.8  
##  9 Carbohydrates..net._g                        0         1      44.8     43.7  
## 10 Fiber..total.dietary_g                       2         0.989   9.57    11.8  
## 11 Sugars..total_g                             17         0.908  18.0     41.6  
## 12 Protein_g                                    0         1       9.08    10.0  
## 13 Cholesterol_mg                               0         1       2.64    35.9  
## 14 Sodium..Na_mg                                0         1     119.     125.   
## 15 Calcium..Ca_mg                               0         1      98.8    185.   
## 16 Magnesium..Mg_mg                             0         1     110.     119.   
## 17 Potassium..K_mg                              0         1     402.     367.   
## 18 Iron..Fe_mg                                  0         1       6.13     4.63 
## 19 Zinc..Zn_mg                                  0         1       3.51     2.69 
## 20 Phosphorus..P_mg                             0         1     284.     279.   
## 21 Vitamin.A..RAE_µg                            5         0.973 118.     168.   
## 22 Vitamin.C..total.ascorbic.acid_mg            0         1       4.13     5.15 
## 23 Thiamin_mg                                   0         1       0.496    0.408
## 24 Riboflavin_mg                                0         1       0.404    0.387
## 25 Niacin_mg                                    0         1       5.52     4.94 
## 26 Vitamin.B.6_mg                               0         1       0.816    1.42 
## 27 Folate..DFE_µg                               3         0.984 238.     341.   
## 28 Folate..food_µg                              0         1      32.3     52.5  
## 29 Folic.acid_µg                                3         0.984 120.     205.   
## 30 Vitamin.B.12_µg                              3         0.984   1.48     2.94 
## 31 Vitamin.D..D2...D3._µg                       3         0.984   0.712    0.916
## 32 Vitamin.E..alpha.tocopherol._mg             17         0.908   2.23     6.12 
## 33 Vitamin.K..phylloquinone._µg                26         0.859   3.26     7.02 
## 34 Water_g                                      0         1      10.4     25.7  
## 35 Energy_pct                                   0         1      17.0     15.3  
## 36 Fat_pct                                      0         1      20.5     44.5  
## 37 Saturated_pct                                0         1      12.6     46.8  
## 38 Carbs_pct                                    0         1      18.1     15.3  
## 39 Fiber_pct                                    2         0.989  38.3     47.3  
## 40 Protein_pct                                  0         1      18.2     20.0  
## 41 Cholesterol_pct                              0         1       0.879   12.0  
## 42 Sodium_pct                                   0         1       4.97     5.21 
## 43 Calcium_pct                                  0         1       9.89    18.5  
## 44 Magnesium_pct                                0         1      26.2     28.3  
## 45 Potassium_pct                                0         1       8.55     7.81 
## 46 Iron_pct                                     0         1      34.1     25.7  
## 47 Zinc_pct                                     0         1      32.0     24.5  
## 48 Phosphorus_pct                               0         1      40.5     39.9  
## 49 Vitamin.A_pct                                5         0.973  13.2     18.7  
## 50 Vitamin.C_pct                                0         1       4.59     5.72 
## 51 Thiamin..B1._pct                             0         1      41.5     34.0  
## 52 Riboflavin..B2._pct                          0         1      31.0     29.7  
## 53 Niacin..B3._pct                              0         1      34.5     30.9  
## 54 Vitamin.B6_pct                               0         1      62.7    109.   
## 55 Folate.equivalent..total._pct                3         0.984  59.4     85.4  
## 56 Vitamin.B12_pct                              3         0.984  61.6    123.   
## 57 Vitamin.D_pct                                3         0.984   4.75     6.11 
## 58 Vitamin.E_pct                               17         0.908  14.9     40.8  
## 59 Vitamin.K_pct                               26         0.859   2.72     5.85 
##       p0     p25    p50    p75    p100 hist 
##  1 26    141     196    433    1627    ▇▂▁▁▁
##  2 26.2  141.    196.   434.   1628.   ▇▂▁▁▁
##  3  0      0.83    1.81   6.61  184.   ▇▁▁▁▁
##  4  0      0.18    0.4    1.25  117.   ▇▁▁▁▁
##  5  0      0       0      0.01    7.44 ▇▁▁▁▁
##  6  0      0.15    0.41   1.92   66.9  ▇▁▁▁▁
##  7  0      0.31    0.64   1.84   56.6  ▇▁▁▁▁
##  8  0.14  29.1    40.7   59.6   279.   ▇▂▁▁▁
##  9  0.14  22.4    33.8   44.7   279.   ▇▁▁▁▁
## 10  0      2.58    6.31  12.3    66.3  ▇▂▁▁▁
## 11  0.1    1.49    6.22  16.3   278.   ▇▁▁▁▁
## 12  0.17   2.99    4.98  12.6    57.3  ▇▂▁▁▁
## 13  0      0       0      0     488.   ▇▁▁▁▁
## 14  0     13.6   116.   210.    617.   ▇▆▁▁▁
## 15  1.2   25.4    56.8  110.   1250.   ▇▁▁▁▁
## 16  3.2   26.6    65.2  212.    456.   ▇▁▂▁▁
## 17 17.6  122     211.   659.   1499.   ▇▂▂▁▁
## 18  0.05   2.5     5.39   8.47   32.5  ▇▅▁▁▁
## 19  0.04   1.49    3.02   5      14.1  ▇▇▂▁▁
## 20  5.8   80.0   163.   556.   1314    ▇▁▂▁▁
## 21  0      0       8.75 228.   1553.   ▇▁▁▁▁
## 22  0      0       1.46   7.99   21.1  ▇▃▁▁▁
## 23  0      0.22    0.43   0.65    2.16 ▇▆▂▁▁
## 24  0      0.13    0.27   0.58    1.68 ▇▆▁▁▁
## 25  0.04   1.88    5.17   7.01   33.8  ▇▃▁▁▁
## 26  0.01   0.19    0.5    0.68    7.44 ▇▁▁▁▁
## 27  1.45  29.4   158.   318.   2015.   ▇▁▁▁▁
## 28  0.5    7.09   12.1   27.5   323.   ▇▁▁▁▁
## 29  0      0      13.8  156.   1164.   ▇▁▁▁▁
## 30  0      0       0      1.91   18.0  ▇▁▁▁▁
## 31  0      0       0      1.32    3.4  ▇▃▁▂▁
## 32  0      0.17    0.34   0.95   36.6  ▇▁▁▁▁
## 33  0      0.48    1.01   3.22   38.9  ▇▁▁▁▁
## 34  0.16   1.34    2.58   6.31  224.   ▇▁▁▁▁
## 35  1.31   7.06    9.8   21.7    81.4  ▇▂▁▁▁
## 36  0      1.28    2.78  10.2   283.   ▇▁▁▁▁
## 37  0      0.9     1.98   6.24  583.   ▇▁▁▁▁
## 38  0.05   9.71   13.6   19.9    93.1  ▇▂▁▁▁
## 39  0     10.3    25.2   49.3   265.   ▇▂▁▁▁
## 40  0.35   5.98    9.96  25.3   115.   ▇▂▁▁▁
## 41  0      0       0      0     163.   ▇▁▁▁▁
## 42  0      0.56    4.85   8.75   25.7  ▇▆▁▁▁
## 43  0.12   2.54    5.68  11.0   125.   ▇▁▁▁▁
## 44  0.76   6.34   15.5   50.4   109.   ▇▁▂▁▁
## 45  0.38   2.6     4.48  14.0    31.9  ▇▂▂▁▁
## 46  0.25  13.9    30.0   47.0   180.   ▇▅▁▁▁
## 47  0.4   13.6    27.4   45.5   128.   ▇▇▂▁▁
## 48  0.83  11.4    23.3   79.5   188.   ▇▁▂▁▁
## 49  0      0       0.97  25.3   173.   ▇▁▁▁▁
## 50  0      0       1.63   8.88   23.4  ▇▃▁▁▁
## 51  0     18.6    36     54.4   180.   ▇▆▂▁▁
## 52  0      9.66   21.1   44.5   129.   ▇▆▁▁▁
## 53  0.28  11.7    32.3   43.8   211.   ▇▃▁▁▁
## 54  0.52  14.3    38.4   52.6   572.   ▇▁▁▁▁
## 55  0.36   7.35   39.5   79.4   504.   ▇▁▁▁▁
## 56  0      0       0     79.5   751.   ▇▁▁▁▁
## 57  0      0       0      8.82   22.7  ▇▃▁▂▁
## 58  0      1.16    2.23   6.33  244.   ▇▁▁▁▁
## 59  0      0.395   0.84   2.69   32.4  ▇▁▁▁▁
# Keep the cereal name, the brand and the numeric nutrient columns, in this
# order; every column not listed here is dropped
data <- dplyr::select(
  data,
  dplyr::all_of(c(
    "cerealName", "brand", "calories", "Energy_kcal",
    "Total.lipid..fat._g", "Fatty.acids..total.saturated_g",
    "Fatty.acids..total.trans_g", "Fatty.acids..total.monounsaturated_g",
    "Fatty.acids..total.polyunsaturated_g", "Carbohydrate..by.difference_g",
    "Carbohydrates..net._g", "Fiber..total.dietary_g", "Sugars..total_g",
    "Protein_g", "Cholesterol_mg", "Sodium..Na_mg", "Calcium..Ca_mg",
    "Magnesium..Mg_mg", "Potassium..K_mg", "Iron..Fe_mg", "Zinc..Zn_mg",
    "Phosphorus..P_mg", "Vitamin.A..RAE_µg",
    "Vitamin.C..total.ascorbic.acid_mg", "Thiamin_mg", "Riboflavin_mg",
    "Niacin_mg", "Vitamin.B.6_mg", "Folate..DFE_µg", "Folate..food_µg",
    "Folic.acid_µg", "Vitamin.B.12_µg", "Vitamin.D..D2...D3._µg",
    "Vitamin.E..alpha.tocopherol._mg", "Vitamin.K..phylloquinone._µg",
    "Water_g", "Energy_pct", "Fat_pct", "Saturated_pct", "Carbs_pct",
    "Fiber_pct", "Protein_pct", "Cholesterol_pct", "Sodium_pct",
    "Calcium_pct", "Magnesium_pct", "Potassium_pct", "Iron_pct", "Zinc_pct",
    "Phosphorus_pct", "Vitamin.A_pct", "Vitamin.C_pct", "Thiamin..B1._pct",
    "Riboflavin..B2._pct", "Niacin..B3._pct", "Vitamin.B6_pct",
    "Folate.equivalent..total._pct", "Vitamin.B12_pct", "Vitamin.D_pct",
    "Vitamin.E_pct", "Vitamin.K_pct"
  ))
)
########################################
#                                      #
# NUTRIENT:  ENERGY CONTENT ANALYSIS   #
#                                      #
########################################
# The five cereals with the highest and the five with the lowest energy
# content

# Sort by energy, highest first, and keep the first five rows
top_5_energy <- head(arrange(data, desc(Energy_kcal)), 5)

# Sort by energy, lowest first, and keep the first five rows
bottom_5_energy <- head(arrange(data, Energy_kcal), 5)

print(top_5_energy)
## # A tibble: 5 × 61
##   cerealName                      brand calories Energy_kcal Total.lipid..fat._g
##   <chr>                           <chr>    <dbl>       <dbl>               <dbl>
## 1 Puffs, Peenut Butter            gene…     1627       1628.              184.  
## 2 General Mills Reese's Peanut B… gene…     1542       1543.              133.  
## 3 Peanut Butter Puffs - Reese's … gene…     1542       1543.              133.  
## 4 Reese's Peanut Butter Puffs     gene…     1542       1543.              133.  
## 5 Post Wheat and Barley           post      1207       1207.                6.85
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## #   Fatty.acids..total.trans_g <dbl>,
## #   Fatty.acids..total.monounsaturated_g <dbl>,
## #   Fatty.acids..total.polyunsaturated_g <dbl>,
## #   Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## #   Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>,
## #   Cholesterol_mg <dbl>, Sodium..Na_mg <dbl>, Calcium..Ca_mg <dbl>, …
# Show the five lowest-energy cereals
bottom_5_energy %>% print()
## # A tibble: 5 × 61
##   cerealName                      brand calories Energy_kcal Total.lipid..fat._g
##   <chr>                           <chr>    <dbl>       <dbl>               <dbl>
## 1 Post Fruit & Fiber Dates, Rais… post        26        26.2                2.61
## 2 Pebbles, Fruit                  nabi…       73        73.5                0.17
## 3 Blueberry Morning, Post         post        84        84.4                0.49
## 4 Post Blueberry Morning          post        84        84.4                0.49
## 5 General Mills Kix               gene…       85        85.7                0.83
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## #   Fatty.acids..total.trans_g <dbl>,
## #   Fatty.acids..total.monounsaturated_g <dbl>,
## #   Fatty.acids..total.polyunsaturated_g <dbl>,
## #   Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## #   Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>,
## #   Cholesterol_mg <dbl>, Sodium..Na_mg <dbl>, Calcium..Ca_mg <dbl>, …
# Distribution of Energy_kcal over all cereals, drawn as a histogram with
# bins 10 units wide on the Energy_kcal axis.
ggplot(data, aes(x = Energy_kcal)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(
    x = "Energy content (kcal)",
    y = "Frequency",
    title = "Distribution of energy content in ALL Cereals"
  )

##################################################
#                                                #
# NUTRIENT:Sugar  ANALYSIS                       #
#                                                #
##################################################

# Mean of Sugars..total_g over all cereals, ignoring missing values.
# The result is kept as a one-row, one-column tibble named avg_sugar.
avg_sugar <- summarise(data, avg_sugar = mean(Sugars..total_g, na.rm = TRUE))

# Cereals whose sugar value is strictly above that mean. Rows with a missing
# sugar value are dropped by filter().
high_sugar_cereals <- filter(
  data,
  Sugars..total_g > avg_sugar[["avg_sugar"]]
)


# Show the mean sugar value
print(avg_sugar)
## # A tibble: 1 × 1
##   avg_sugar
##       <dbl>
## 1      18.0
# Show the cereals whose Sugars..total_g is above the overall mean
print(high_sugar_cereals)
## # A tibble: 41 × 61
##    cerealName                     brand calories Energy_kcal Total.lipid..fat._g
##    <chr>                          <chr>    <dbl>       <dbl>               <dbl>
##  1 All Bran Bran Buds             kell…      231        232.                2.25
##  2 Bran, Raisin                   quak…      433        434.                0.67
##  3 Corn Flakes, Honey Crunch      kell…     1030       1031.                0   
##  4 Cracklin' Oat Bran             kell…      258        258.                9.21
##  5 Crispy Wheat with Raisins      quak…      433        434.                0.67
##  6 General Mills Raisin Nut Bran  gene…      240        240.                3.46
##  7 General Mills Reese's Peanut … gene…     1542       1543.              133.  
##  8 General Mills Total Raisin Br… gene…      187        188.                1.6 
##  9 General Mills Wheaties Raisin… gene…      187        188.                1.6 
## 10 Grahams, Golden                post       361        361.                8.9 
## # ℹ 31 more rows
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## #   Fatty.acids..total.trans_g <dbl>,
## #   Fatty.acids..total.monounsaturated_g <dbl>,
## #   Fatty.acids..total.polyunsaturated_g <dbl>,
## #   Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## #   Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>, …
##################################################
#                                                #
# NUTRIENT:FIBER  ANALYSIS                       #
#                                                #
##################################################

# Mean of Fiber..total.dietary_g over all cereals, ignoring missing values.
# The result is kept as a one-row, one-column tibble named avg_fiber.
avg_fiber <- summarise(
  data,
  avg_fiber = mean(Fiber..total.dietary_g, na.rm = TRUE)
)

# Cereals whose fiber value is strictly above that mean. Rows with a missing
# fiber value are dropped by filter().
high_fiber_cereals <- filter(
  data,
  Fiber..total.dietary_g > avg_fiber[["avg_fiber"]]
)

# Show the mean fiber value
print(avg_fiber)
## # A tibble: 1 × 1
##   avg_fiber
##       <dbl>
## 1      9.57
# Show the cereals whose Fiber..total.dietary_g is above the overall mean
print(high_fiber_cereals)
## # A tibble: 56 × 61
##    cerealName              brand        calories Energy_kcal Total.lipid..fat._g
##    <chr>                   <chr>           <dbl>       <dbl>               <dbl>
##  1 100% Bran Cereal        nabisco           161        161.                3.04
##  2 All Bran Bran Buds      kellog            231        232.                2.25
##  3 Banana Nut Crunch       post              813        814.               70.5 
##  4 Barley, Wheat           quaker            556        556.                2.62
##  5 Bran Cereal             nabisco           161        161.                3.04
##  6 Bran Flakes             kellog            161        161.                3.04
##  7 Cheerios-Apple Cinnamon generalMills      308        308.                1.55
##  8 Cheerios-Honey Nut      generalMills      813        814.               70.5 
##  9 Chex, Honey Nut         generalMills      813        814.               70.5 
## 10 Chex, Wheat             generalMills      556        556.                2.62
## # ℹ 46 more rows
## # ℹ 56 more variables: Fatty.acids..total.saturated_g <dbl>,
## #   Fatty.acids..total.trans_g <dbl>,
## #   Fatty.acids..total.monounsaturated_g <dbl>,
## #   Fatty.acids..total.polyunsaturated_g <dbl>,
## #   Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## #   Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>, …
##################################################
#                                                #
# NUTRIENT:CORRELATION fat_pct,cholesterol_mg    #
#                                                #
##################################################
# Correlation between Fat_pct and Cholesterol_mg (cor()'s default method),
# computed only on the cereals where both values are present.
correlation <- with(
  data,
  cor(Fat_pct, Cholesterol_mg, use = "complete.obs")
)
print(correlation)
## [1] 0.4365212
##################################################
#                                                #
# NUTRIENT: MEAN VALUES BY BRAND                 #
#                                                #
##################################################
# Mean of every numeric column, computed separately for each brand.
# Missing values are ignored within each brand's mean.
# na.rm is supplied inside the lambda rather than through across()'s `...`,
# which is deprecated as of dplyr 1.1.0 and emitted a warning on every run.
nutrient_by_brand <- data %>%
  group_by(brand) %>%
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))

# One row per brand, one column per numeric nutrient
print(nutrient_by_brand)
## # A tibble: 5 × 60
##   brand        calories Energy_kcal Total.lipid..fat._g Fatty.acids..total.sat…¹
##   <chr>           <dbl>       <dbl>               <dbl>                    <dbl>
## 1 generalMills     300.        300.               14.2                      3.70
## 2 kellog           246.        247.                4.24                     1.20
## 3 nabisco          251.        251.                3.56                     1.08
## 4 post             426.        426.               18.8                      2.05
## 5 quaker           478.        479.               19.2                      2.60
## # ℹ abbreviated name: ¹​Fatty.acids..total.saturated_g
## # ℹ 55 more variables: Fatty.acids..total.trans_g <dbl>,
## #   Fatty.acids..total.monounsaturated_g <dbl>,
## #   Fatty.acids..total.polyunsaturated_g <dbl>,
## #   Carbohydrate..by.difference_g <dbl>, Carbohydrates..net._g <dbl>,
## #   Fiber..total.dietary_g <dbl>, Sugars..total_g <dbl>, Protein_g <dbl>,
## #   Cholesterol_mg <dbl>, Sodium..Na_mg <dbl>, Calcium..Ca_mg <dbl>, …
# Distribution of Fiber..total.dietary_g over all cereals, drawn as a
# histogram with bins one unit wide. Cereals with a missing fiber value are
# left out of the plot by ggplot2, which reports them in a warning.
ggplot(data, aes(x = Fiber..total.dietary_g)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  labs(
    x = "Fiber content (g)",
    y = "Frequency",
    title = "Distribution of fiber content in cereals"
  )
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

# Scatter plot of Fat_pct against Cholesterol_mg, one point per cereal.
# The y aesthetic is Cholesterol_mg (not Cholesterol_pct) so that the plot
# matches its "Cholesterol (mg)" axis label and shows the same pair of
# variables as the Fat_pct / Cholesterol_mg correlation computed above.
ggplot(data, aes(x = Fat_pct, y = Cholesterol_mg)) +
  geom_point() +
  xlab("Fat Percentage") +
  ylab("Cholesterol (mg)") +
  ggtitle("Scatter plot of Fat Percentage vs Cholesterol")

##################################################
#                                                #
# CORRELATION ANALYSIS                           #
#                                                #
##################################################

# Keep the numeric columns only; cor() cannot handle the character columns.
is_numeric_column <- vapply(data, is.numeric, logical(1))
numeric_data <- data[, is_numeric_column]

# Pairwise correlations between all numeric columns. Each pair is computed on
# the rows where both columns are present ("pairwise.complete.obs").
correlation_matrix <- cor(numeric_data, use = "pairwise.complete.obs")

# Draw the matrix as a colour-coded grid. tl.cex / cl.cex scale the label and
# legend text, and mar sets the plot margins.
corrplot(
  correlation_matrix,
  method = "color",
  tl.cex = 0.8,
  cl.cex = 0.8,
  mar = c(1, 1, 3, 4)
)

                                ##################################################
                                #                                                #
                                #          REGRESSION  ANALYSIS                  #
                                #                                                #
                                ##################################################
                                ##################################################
                                #                                                #
                                #                   CLEANING                     #
                                #                                                #
                                ##################################################
# Location of the raw cereal dataset
url <- "https://raw.githubusercontent.com/kwartler/Hult_Intro2R/main/A1_CerealEDA/cereals.csv"

# Re-read the full CSV so the regression section starts again from all
# columns of the file, replacing whatever `data` held before.
data <- url %>%
  read_csv()
## Rows: 185 Columns: 66
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): cerealName, parsedName, brand, dietLabels, healthLabels, rawGPTRan...
## dbl (59): calories, Energy_kcal, Total.lipid..fat._g, Fatty.acids..total.sat...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Drop rows with more than 2 NA values. The count is taken across every
# column of the row, character columns included.
data <- data[rowSums(is.na(data)) <= 2, ]

# Mean-impute the remaining NA values in every numeric column: each missing
# entry is replaced by the mean of that column's observed values.
# Plain numeric vectors are produced (rather than Hmisc "impute" objects) so
# the columns behave like ordinary doubles in later steps.
data_imputed <- data %>%
  mutate(across(
    where(is.numeric),
    ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE))
  ))

# Carry the imputed values forward. Every later step (label encoding, rank
# filtering, regression) works on `data`; previously the imputed copy was
# computed and then never used, so lm() silently dropped the incomplete rows.
data <- data_imputed
                                ##################################################
                                #                                                #
                                #                   ONE HOT ENCODING             #
                                #                                                #
                                ##################################################

########################################
#                                      #
#       HEALTH                         #
#                                      #
########################################

# Split each cereal's healthLabels string on ", " (comma followed by a space).
# The per-row label sets are kept so membership can be tested exactly later.
health_label_sets <- strsplit(data$healthLabels, ", ", fixed = TRUE)

# All labels across all cereals, flattened into one character vector.
all_labels <- unlist(health_label_sets)

# Distinct labels. Missing and empty labels are removed so they do not turn
# into an all-NA or meaningless indicator column.
unique_labels <- unique(all_labels)
unique_labels <- unique_labels[!is.na(unique_labels) & nzchar(unique_labels)]

# One logical indicator per label: TRUE when the cereal carries exactly that
# label. Exact set membership (%in%) is used instead of grepl(), because
# grepl() treats the label as a regular expression and matches substrings --
# for example a label such as "FISH_FREE" would also match "SHELLFISH_FREE".
encoded_data <- lapply(unique_labels, function(label) {
  vapply(
    health_label_sets,
    function(row_labels) label %in% row_labels,
    logical(1)
  )
})
names(encoded_data) <- unique_labels

# Convert the indicators to a data frame, keeping the label text unchanged
# as the column names.
encoded_data <- as.data.frame(encoded_data, check.names = FALSE)


# Append the indicator columns to the right of the existing columns.
data <- cbind(data, encoded_data)


########################################
#                                      #
#       DIET                           #
#                                      #
########################################

# Split each cereal's dietLabels string on ", " (comma followed by a space).
# The per-row label sets are kept so membership can be tested exactly later.
diet_label_sets <- strsplit(data$dietLabels, ", ", fixed = TRUE)

# All diet labels across all cereals, flattened into one character vector.
all_dietLabels <- unlist(diet_label_sets)

# Distinct diet labels. Missing and empty labels are removed so they do not
# turn into an all-NA or meaningless indicator column.
unique_dietLabels <- unique(all_dietLabels)
unique_dietLabels <- unique_dietLabels[
  !is.na(unique_dietLabels) & nzchar(unique_dietLabels)
]

# One logical indicator per diet label: TRUE when the cereal carries exactly
# that label. Exact set membership (%in%) is used instead of grepl(), because
# grepl() treats the label as a regular expression and matches substrings, so
# a label contained in a longer label would be reported as present.
encoded_data <- lapply(unique_dietLabels, function(label) {
  vapply(
    diet_label_sets,
    function(row_labels) label %in% row_labels,
    logical(1)
  )
})
names(encoded_data) <- unique_dietLabels

# Convert the indicators to a data frame, keeping the label text unchanged
# as the column names.
encoded_data <- as.data.frame(encoded_data, check.names = FALSE)


# Append the diet indicator columns to the right of the existing columns.
data <- cbind(data, encoded_data)
# Work on the 'rawGPTRank' column as text so it can be pattern-checked
data$rawGPTRank <- as.character(data$rawGPTRank)

# Parse the ranks as numbers. Entries that are not numbers become NA; the
# coercion warning is expected and therefore silenced for this call only.
rank_values <- suppressWarnings(as.numeric(data$rawGPTRank))

# Keep rows whose 'rawGPTRank' consists only of digits and decimal points
# AND parses to a number (this rejects strings such as "." or "1.2.3")
numeric_rows <- grepl("^[0-9.]+$", data$rawGPTRank) & !is.na(rank_values)

# Subset the dataframe to include only these rows
data <- data[numeric_rows, ]

# Store the rank as a number. Left as character, lm() would treat the
# response as a factor and fit its level codes (ordered as text, so "10"
# sorts before "2") instead of the actual rank values.
data$rawGPTRank <- rank_values[numeric_rows]
# Linear regression of 'rawGPTRank' on every numeric column of `data`.
# The predictor list is built programmatically: all numeric columns are
# collected, 'rawGPTRank' itself is excluded, and the remaining names are
# joined into a formula that is passed to lm(). summary() then reports the
# coefficients, residuals and fit statistics.


# Flag the numeric columns and make sure the response is not a predictor
numeric_vars <- vapply(data, is.numeric, logical(1))
numeric_vars["rawGPTRank"] <- FALSE
numeric_cols <- names(which(numeric_vars))

# Assemble "rawGPTRank ~ col1 + col2 + ..." and turn it into a formula
predictor_terms <- paste(numeric_cols, collapse = " + ")
regression_formula <- as.formula(paste("rawGPTRank ~", predictor_terms))

# Fit the linear model
model <- lm(regression_formula, data = data)

# Coefficients, residual summary and goodness-of-fit statistics
summary(model)
## 
## Call:
## lm(formula = regression_formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.6250  0.0000  0.0000  0.3125  1.3750 
## 
## Coefficients: (21 not defined because of singularities)
##                                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                            31.27509    9.50493   3.290  0.00164 **
## calories                                6.04930    2.05338   2.946  0.00451 **
## Energy_kcal                          -126.71458   46.06711  -2.751  0.00776 **
## Total.lipid..fat._g                  -110.44983  109.93764  -1.005  0.31890   
## Fatty.acids..total.saturated_g       1952.29834  685.47689   2.848  0.00593 **
## Fatty.acids..total.trans_g           -143.21922   55.39990  -2.585  0.01206 * 
## Fatty.acids..total.monounsaturated_g  -11.66551    5.51019  -2.117  0.03821 * 
## Fatty.acids..total.polyunsaturated_g  -11.27790    4.79389  -2.353  0.02178 * 
## Carbohydrate..by.difference_g         185.08042  113.13661   1.636  0.10685   
## Carbohydrates..net._g                -681.53293  248.75193  -2.740  0.00799 **
## Fiber..total.dietary_g               -679.80366  247.97234  -2.741  0.00795 **
## Sugars..total_g                         1.29942    0.47096   2.759  0.00758 **
## Protein_g                               4.53735    1.77904   2.550  0.01320 * 
## Cholesterol_mg                         -2.59453    0.94834  -2.736  0.00807 **
## Sodium..Na_mg                           0.02976    0.01116   2.665  0.00975 **
## Calcium..Ca_mg                         -0.01420    0.01194  -1.190  0.23859   
## Magnesium..Mg_mg                       -0.38838    0.13803  -2.814  0.00653 **
## Potassium..K_mg                         0.04488    0.01316   3.410  0.00114 **
## Iron..Fe_mg                             1.48292    0.55607   2.667  0.00972 **
## Zinc..Zn_mg                             4.90079    1.92445   2.547  0.01333 * 
## Phosphorus..P_mg                       -0.11459    0.04681  -2.448  0.01717 * 
## Vitamin.A..RAE_µg                      -0.04541    0.01591  -2.854  0.00583 **
## Vitamin.C..total.ascorbic.acid_mg      -3.44057    1.25582  -2.740  0.00799 **
## Thiamin_mg                              4.53000    2.77818   1.631  0.10797   
## Riboflavin_mg                          -7.79950    2.63311  -2.962  0.00431 **
## Niacin_mg                              -5.11805    1.87923  -2.723  0.00835 **
## Vitamin.B.6_mg                         18.15867    7.34980   2.471  0.01621 * 
## Folate..DFE_µg                         58.51579   20.82501   2.810  0.00659 **
## Folate..food_µg                       -59.68778   21.25060  -2.809  0.00661 **
## Folic.acid_µg                         -99.51582   35.41547  -2.810  0.00659 **
## Vitamin.B.12_µg                         1.37699    0.49790   2.766  0.00745 **
## Vitamin.D..D2...D3._µg                  4.75141    1.70265   2.791  0.00695 **
## Vitamin.E..alpha.tocopherol._mg         6.86078    2.51417   2.729  0.00823 **
## Vitamin.K..phylloquinone._µg           -4.10880    1.60858  -2.554  0.01307 * 
## Water_g                                 0.58664    0.23753   2.470  0.01624 * 
## Energy_pct                           2412.12504  883.37226   2.731  0.00819 **
## Fat_pct                                76.33909   72.93711   1.047  0.29926   
## Saturated_pct                        -387.36367  136.10420  -2.846  0.00597 **
## Carbs_pct                            1489.78256  461.46497   3.228  0.00198 **
## Fiber_pct                                    NA         NA      NA       NA   
## Protein_pct                                  NA         NA      NA       NA   
## Cholesterol_pct                              NA         NA      NA       NA   
## Sodium_pct                                   NA         NA      NA       NA   
## Calcium_pct                                  NA         NA      NA       NA   
## Magnesium_pct                                NA         NA      NA       NA   
## Potassium_pct                                NA         NA      NA       NA   
## Iron_pct                                     NA         NA      NA       NA   
## Zinc_pct                                     NA         NA      NA       NA   
## Phosphorus_pct                               NA         NA      NA       NA   
## Vitamin.A_pct                                NA         NA      NA       NA   
## Vitamin.C_pct                                NA         NA      NA       NA   
## Thiamin..B1._pct                             NA         NA      NA       NA   
## Riboflavin..B2._pct                          NA         NA      NA       NA   
## Niacin..B3._pct                              NA         NA      NA       NA   
## Vitamin.B6_pct                               NA         NA      NA       NA   
## Folate.equivalent..total._pct                NA         NA      NA       NA   
## Vitamin.B12_pct                              NA         NA      NA       NA   
## Vitamin.D_pct                                NA         NA      NA       NA   
## Vitamin.E_pct                                NA         NA      NA       NA   
## Vitamin.K_pct                                NA         NA      NA       NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7068 on 63 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.7511, Adjusted R-squared:  0.601 
## F-statistic: 5.004 on 38 and 63 DF,  p-value: 9.59e-09
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#$                           $
#$      T  H  A  N  K        $
#$        Y  O  U            $
#$                           $
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$

# Print a big "THANK YOU" banner built from "#" characters.
#
# Takes no arguments. Writes four lines to the console with cat() and returns
# NULL invisibly.
print_big_thank_you <- function() {
  # Four-row glyph for each character of the message. Every row of every
  # glyph is exactly five characters wide so the letters line up in columns
  # (the "A" glyph previously mixed widths 4 and 5, skewing the first two rows).
  glyphs <- list(
    "T" = c("#####", "  #  ", "  #  ", "  #  "),
    "H" = c("#   #", "#   #", "#####", "#   #"),
    "A" = c(" ### ", "#   #", "#####", "#   #"),
    "N" = c("#   #", "##  #", "# # #", "#  ##"),
    "K" = c("#   #", "#  # ", "##   ", "#  # "),
    "Y" = c("#   #", " # # ", "  #  ", "  #  "),
    "O" = c("#####", "#   #", "#   #", "#####"),
    "U" = c("#   #", "#   #", "#   #", "#####"),
    " " = c("     ", "     ", "     ", "     ")
  )
  message_chars <- c("T", "H", "A", "N", "K", " ", "Y", "O", "U")

  # Print the banner one row at a time: take the same row from every glyph
  # and join the pieces with two spaces.
  for (row in seq_along(glyphs[[1]])) {
    row_pieces <- vapply(
      message_chars,
      function(ch) glyphs[[ch]][row],
      character(1)
    )
    cat(paste(row_pieces, collapse = "  "), "\n", sep = "")
  }

  invisible(NULL)
}

# Call the function to print the big "THANK YOU"
print_big_thank_you()
## #####  #   #   ###   #   #  #   #         #   #  #####  #   #
##   #    #   #  #   #  ##  #  #  #           # #   #   #  #   #
##   #    #####  #####  # # #  ##              #    #   #  #   #
##   #    #   #  #   #  #  ##  #  #            #    #####  #####